nyc_zips =
read_csv("data/cleaned_shapes/nyc_zip_codes.csv") |>
mutate(zip = as.character(ZipCode)) |>
select(zip)
resp_df =
read_csv("data/raw_data/respiratory.csv")
asth_df =
read_csv("data/raw_data/asthma.csv")
resp_df_tidy =
resp_df |>
janitor::clean_names() |>
mutate(age_group = dim2value,
count_resp = count) |>
select(zip, age_group, date, count_resp)
asth_df_tidy =
asth_df |>
janitor::clean_names() |>
mutate(age_group = dim2value,
count_asth = count) |>
select(zip, age_group, date, count_asth)
df_all_resp =
merge(resp_df_tidy, asth_df_tidy, by = c("zip", "date", "age_group"))
df_all_resp |>
write_csv("data/joined_respiratory.csv")
joined_respiratory.csv and
air_quality_disease.csv# Load dataset with respiratory and asthma counts (missing some zipcodes)
dis_asth_df = read_csv("data/cleaned_data/joined_respiratory.csv") |>
filter(zip != 88888 & zip != "Citwide" & age_group == "All age groups") |>
mutate(date = as.Date(format(as.Date(date, "%m/%d/%y"))),
year = as.numeric(format(date, format = "%Y")),
month = month.name[as.numeric(format(date, format = "%m"))],
day = as.numeric(format(date, format = "%d")),
zip = as.numeric(zip)) |>
select(year, month, day, zip, count_resp, count_asth)
# Load dataset with pneumonia data
dis_pneu_df = read_csv("data/raw_data/disease_hospital_admin.csv") |>
separate(date, into=c("month", "day", "year")) |>
mutate(day = as.numeric(day),
month = month.name[as.numeric(month)],
year = as.numeric(year)) |>
rename(zip = mod_zcta) |>
select(year, month, day, zip, total_ed_visits, ili_pne_visits, ili_pne_admissions)
# Merge two disease datasets
all_dis_df =
full_join(dis_asth_df, dis_pneu_df, by = c("month", "day", "year", "zip"))
# Now merge disease with air quality dataset
air_qual_df = read_csv("data/cleaned_data/alt_air_data.csv") |>
separate(date, into=c("month", "day", "year")) |>
mutate(day = as.numeric(day),
month = month.name[as.numeric(month)],
year = as.numeric(year)) |>
filter(!(year == '2020' & month == 'January'),
!(year == '2020' & month == 'February'),
!(aqs_parameter_desc == "Acceptable PM2.5 AQI & Speciation Mass")) |>
rename(zip = zip_code) |>
select(year, month, day, zip, daily_aqi_value, pollutant, value) |>
group_by(day, month, year, zip, pollutant) |>
summarize(value = mean(value))
# air_qual_df_aqi = read_csv("data/cleaned_data/alt_air_data.csv") |>
# separate(date, into=c("month", "day", "year")) |>
# mutate(day = as.numeric(day),
# month = month.name[as.numeric(month)],
# year = as.numeric(year)) |>
# filter(!(year == '2020' & month == 'January'),
# !(year == '2020' & month == 'February'),
# !(aqs_parameter_desc == "Acceptable PM2.5 AQI & Speciation Mass")) |>
# rename(zip = zip_code) |>
# select(year, month, day, zip, daily_aqi_value, pollutant, value) |>
# group_by(day, month, year, zip, pollutant) |>
# summarize(daily_aqi = mean(daily_aqi_value))
### note: removed acceptable pm2.5... from https://www.epa.gov/aqs/aqs-memos-technical-note-reporting-pm25-continuous-monitoring-and-speciation-data-air-quality
# Finally merge disease and air qual data together
air_qual_dis = # with value
full_join(all_dis_df, air_qual_df, by = c("month", "day", "year", "zip")) |>
mutate(
numeric_month = match(month, month.name),
date = as.Date(paste(year, numeric_month, day, sep = "-"), format = "%Y-%m-%d")
) |>
select(year, month, day, date, zip, count_resp, count_asth, total_ed_visits,
ili_pne_visits, ili_pne_admissions, pollutant, value) |>
pivot_wider(names_from = pollutant,
values_from = value) |>
select(-`NA`) |>
rename(pm25_ug_m3_lc = PM2.5,
co_ppm = CO,
o3_ppm = Ozone,
no2_ppb = NO2) |>
filter(!(year == '2022' & month == 'December'))
# air_qual_dis_aqi = # with daily aqi value
# full_join(all_dis_df, air_qual_df_aqi, by = c("month", "day", "year", "zip")) |>
# select(year, month, day, zip, count_resp, count_asth, total_ed_visits,
# ili_pne_visits, ili_pne_admissions, pollutant, daily_aqi) |>
# pivot_wider(names_from = pollutant,
# values_from = daily_aqi) |>
# select(-`NA`) |>
# rename(pm25_ug_m3_lc = PM2.5,
# co_ppm = CO,
# o3_ppm = Ozone,
# no2_ppb = NO2) |>
# filter(!(year == '2022' & month == 'December')) |>
# mutate(
# numeric_month = match(month, month.name),
# date = as.Date(paste(year, numeric_month, day, sep = "-"), format = "%Y-%m-%d")
# )
# measurement: ozone (daily_max_8_hour_ozone_concentration)
# co (daily_max_8_hour_co_concentration)
# pm2.5 (daily_mean_pm2_5_concentration)
# no2 (daily_max_1_hour_no2_concentration)
airqual_annual = read_csv("data/cleaned_data/uhf_airquality.csv") |>
filter(str_detect(name, "Fine") | str_detect(name, "NO2") | str_detect(name, "O3"))
airqual_annual = read_csv("data/cleaned_data/uhf_airquality.csv") |>
filter(str_detect(name, "Fine"))
table(airqual_annual$time_period)
##
## Annual Average 2009 Annual Average 2010 Annual Average 2011 Annual Average 2012
## 42 42 42 42
## Annual Average 2013 Annual Average 2014 Annual Average 2015 Annual Average 2016
## 42 42 42 42
## Annual Average 2017 Annual Average 2018 Annual Average 2019 Annual Average 2020
## 42 42 42 42
## Annual Average 2021 Summer 2009 Summer 2010 Summer 2011
## 42 42 42 42
## Summer 2012 Summer 2013 Summer 2014 Summer 2015
## 42 42 42 42
## Summer 2016 Summer 2017 Summer 2018 Summer 2019
## 42 42 42 42
## Summer 2020 Summer 2021 Winter 2008-09 Winter 2009-10
## 42 42 42 42
## Winter 2010-11 Winter 2011-12 Winter 2012-13 Winter 2013-14
## 42 42 42 42
## Winter 2014-15 Winter 2015-16 Winter 2016-17 Winter 2017-18
## 42 42 42 42
## Winter 2018-19 Winter 2019-20 Winter 2020-21
## 42 42 42
#### Prepare dataset from daily data (Summer: 1June-31Aug, Winter 1Dec-28Feb)
# Annual averages for each pollutant and each zip
airqual_yr_avg = air_qual_df |>
group_by(year, zip, pollutant) |>
summarize(mean = mean(value)) |>
pivot_wider(
names_from = pollutant,
values_from = mean
)
# Seasonal averages for each pollutant and each zip
airqual_szn_avg = air_qual_df |>
mutate(season = case_when(
month %in% c("June", "July", "August") ~ "summer",
month %in% c("December", "January", "February") ~ "winter",
TRUE ~ "other"
)) |>
filter(season != "other") |>
group_by(season, year, zip, pollutant) |>
summarize(mean = mean(value)) |>
pivot_wider(
names_from = pollutant,
values_from = mean
)
#### Prepare dataset from daily data
airqual_annual = read_csv("data/cleaned_data/uhf_airquality.csv") |>
filter(str_detect(name, "Fine") | str_detect(name, "NO2") | str_detect(name, "O3")) |>
select(name, )
#### Comparison between daily data and yearly/seasonal data
Let us first visualize how the number of emergency department visits change over time. We can see that daily ER visits seem to peak in winter months except in 2021. This could be due to COVID restrictions.
all_dis_plot_df = all_dis_df |>
mutate(
numeric_month = match(month, month.name),
date = as.Date(paste(year, numeric_month, day, sep = "-"), format = "%Y-%m-%d")
) |>
select(-day, -month, -year, -numeric_month) |>
pivot_longer(cols = c("count_resp", "count_asth", "total_ed_visits", "ili_pne_visits", "ili_pne_admissions"),
names_to = "variable",
values_to = "count")
er_plot = all_dis_plot_df |>
filter(variable == "total_ed_visits") |>
plot_ly(x = ~date, y = ~count,
type = 'scatter',
mode = 'markers',
text = ~paste("Zip: ", zip, "<br>Date:", date, "<br>Count:", count),
color = "viridis") |>
layout(title = 'Daily total ER visits per zip', plot_bgcolor = "#e5ecf6",
xaxis = list(title = "Date"),
yaxis = list(title = "Counts"))
er_plot